Text Analysis Steps

Some libraries that may be used:

from bs4 import BeautifulSoup as bsoup
import re
import os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import reuters
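Several snippets below refer to variables such as tokenized_reuters and all_words that are never defined in these notes. A minimal sketch of how they could be built from the NLTK Reuters corpus (assuming nltk.download('reuters') has been run; the variable names simply mirror the later snippets):

# Sketch (not from the original notes): build a {fileid: token list} dict
# and a flat token list from the Reuters corpus.
tokenizer = RegexpTokenizer(r"\w+")
tokenized_reuters = {fileid: tokenizer.tokenize(reuters.raw(fileid).lower())
                     for fileid in reuters.fileids()}
all_words = list(chain.from_iterable(tokenized_reuters.values()))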

Generate the top 100 bigram collocations:

bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
bigram_finder.apply_freq_filter(2)                      # keep bigrams occurring at least twice
bigram_finder.apply_word_filter(lambda w: len(w) < 3)   # drop words shorter than 3 characters
top_100_bigrams = bigram_finder.nbest(bigram_measures.pmi, 100)  # top-100 bigrams by PMI
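MWETokenizer is imported above but never used; one natural follow-up (a sketch, not part of the original notes) is to retokenize the word list so that each of the top-100 collocations is kept as a single token:

# Sketch: treat each top collocation as a single multi-word token,
# e.g. ('new', 'york') -> 'new_york'
mwe_tokenizer = MWETokenizer(top_100_bigrams)
colloc_tokens = mwe_tokenizer.tokenize(all_words)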

Generate the TF-IDF vectors:

from sklearn.feature_extraction.text import TfidfVectorizer

# patent_words / pids are assumed to be the document collection and its matching ids
tfidf_vectorizer = TfidfVectorizer(input = 'content', analyzer = 'word')
tfidf_vectors = tfidf_vectorizer.fit_transform(patent_words)

Alternatively:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer = "word")
tfs = tfidf.fit_transform([' '.join(value) for value in tokenized_reuters.values()])
vocab = tfidf.get_feature_names()   # get_feature_names_out() on newer scikit-learn versions
for word, weight in zip(vocab, tfs.toarray()[0]):
    if weight > 0:
        print(word, ":", weight)

Write the results to a txt file:

save_file = open("patent_student.txt", 'w')
vocab = tfidf_vectorizer.get_feature_names()
cx = tfidf_vectors.tocoo()   # return the coordinate (COO) representation of the sparse matrix
for i, j, v in itertools.zip_longest(cx.row, cx.col, cx.data):
    save_file.write(pids[i] + ',' + vocab[j] + ',' + str(v) + '\n')
save_file.close()
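As a quick sanity check of the second variant, the highest-weighted terms of one document can be listed instead of every non-zero weight. A sketch, assuming the tfs matrix and vocab list defined above:

import numpy as np

# Sketch: the 10 highest-weighted TF-IDF terms of the first document
row = tfs.toarray()[0]
top_idx = np.argsort(row)[::-1][:10]
for idx in top_idx:
    if row[idx] > 0:
        print(vocab[idx], ":", row[idx])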

Most common words

1. Words with the highest counts:

from nltk.probability import *
fd_1 = FreqDist(words)   # words: the full list of tokens (e.g. all_words above)
fd_1.most_common(25)

2. Words appearing in the most documents:

words_2 = list(chain.from_iterable([set(value) for value in tokenized_reuters.values()]))
fd_2 = FreqDist(words_2)
fd_2.most_common(25)

3. Low-frequency words:

fd_3 = FreqDist(words)   # same distribution as fd_1
lessFreqWords = set([k for k, v in fd_3.items() if v < 2])

Alternatively:

lessFreqWords = set(fd_3.hapaxes())

def removeLessFreqWords(fileid):
    return (fileid, [w for w in tokenized_reuters[fileid] if w not in lessFreqWords])
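removeLessFreqWords is defined but never applied; a one-line sketch (not in the original notes) of using it to rebuild tokenized_reuters without the rare words:

# Sketch: rebuild the tokenized corpus with the low-frequency words removed
tokenized_reuters = dict(removeLessFreqWords(fileid) for fileid in tokenized_reuters)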

View where a particular word occurs (concordance):

nltk.Text(reuters.words()).concordance('net')

Creating Count Vectors

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word")
data_features = vectorizer.fit_transform([' '.join(value) for value in tokenized_reuters.values()])
vocab2 = vectorizer.get_feature_names()   # get_feature_names_out() on newer scikit-learn versions
for word, count in zip(vocab2, data_features.toarray()[0]):
    if count > 0:
        print(word, ":", count)
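The loop above only inspects the first document; to get corpus-level counts, a short sketch (assuming data_features and vocab2 from the snippet above) that sums the count matrix over all documents and prints the 25 most frequent terms:

import numpy as np

# Sketch: total count of each vocabulary word across the whole corpus
total_counts = np.asarray(data_features.sum(axis=0)).ravel()
top_idx = np.argsort(total_counts)[::-1][:25]
for idx in top_idx:
    print(vocab2[idx], ":", total_counts[idx])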

Extracting bigrams

The BigramCollocationFinder recipe shown above (top-100 bigrams by PMI) works here as well. Alternatively, raw bigram counts can be taken directly with nltk.util.ngrams:

from nltk.util import ngrams

bigrams = ngrams(reuters.words(), n = 2)
fdbigram = FreqDist(bigrams)
fdbigram.most_common()

The following code finds the best 50 bigrams using their PMI scores (PMI picks out the most strongly associated bigrams):

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(reuters.words())
finder.nbest(bigram_measures.pmi, 50)